nusmods at https://api.nusmods.com/..JSON format, convert to a dataframe.myjson <- fromJSON(file = url("https://api.nusmods.com/corsBiddingStatsRaw.json")) # read data directly from URL
myBid <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myBid
{
if(myjson[[r]]$Semester == 1 | myjson[[r]]$Semester == 2) # if semester 1 or 2
{
myBid <- rbind(myBid, myjson[[r]]) # add to dataframe
}
myjson[[r]] <- NA # free up some RAM
}
saveRDS(myBid, file = "myBid.RDS") # save to directorymyBid.RDSmyBid.RDS and load it directly from my local folder while I worked on the project.myModInfo <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2011:2018)) # looping through each year
{
for(semester in c(1,2))
{
# create the url where data is to be extracted from
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/", semester, "/moduleTimetableDeltaRaw.json")
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]]$ModuleCode, "^PL"))) # only keep info if module code begins with PL
{
if(myjson[[r]]$Semester == 1 | myjson[[r]]$Semester == 2) # only get semester 1 and 2 information
{
myModInfo <- rbind(myModInfo, myjson[[r]]) # add to dataframe
}
}
myjson[[r]] <- NA # replace the element with NA to free up some rAM
}
cat(year, "Semester", semester, "Done!") # progress tracker
}
}
myTitles <- data.frame() # create empty dataframe which will act as a container to be populated with data
for(year in c(2014:2018)) # looping through each year
{
myurl <- paste0("https://api.nusmods.com/", year, "-", year + 1, "/moduleList.json") # create the url where data is to be extracted from
myjson <- fromJSON(file = url(myurl))
for(r in 1:length(myjson)) # for each element in the myjson list, append it to myModInfo
{
if(isTRUE(str_detect(myjson[[r]]$ModuleCode, "^PL"))) # only keep info if module code begins with PL
{
if(paste0(myjson[[r]]$Semester, collapse = "|") == "1"|
paste0(myjson[[r]]$Semester, collapse = "|") == "2"|
paste0(myjson[[r]]$Semester, collapse = "|") == "1|2") # only keep information from semester 1 and 2
{
myTitles <- rbind(myTitles, as.data.frame(myjson[[r]])) # add to dataframe
}
}
myjson[[r]] <- NA # free RAM
}
}
myModInfo <- myTitles %>% # add titles information to myModInfo
select(ModuleCode, ModuleTitle) %>% # select these two columns
filter(ModuleTitle != "Lab in Applied Psychology") %>%
distinct() %>% # remove duplicates
right_join(myModInfo, by = "ModuleCode") # left = myTitles, right = myModInfo
saveRDS(myModInfo, file = "myModInfo.RDS") # save to directorymyModInfo.RDSmyModInfo.RDS and load the data directly while I worked on the project.myModInfo.
myModInfo <- myModInfo %>%
select(-LastModified, -LastModified_js, -isDelete) %>% # remove these columns
filter(str_detect(ModuleCode, "^PL")) %>% # removing non-Psychology modules
filter(!is.na(ModuleTitle)) %>% # removing modules without module titles #PL3285, PL4220, PL4217
filter(LessonType != "TUTORIAL") %>% # removing information about tutorials
select(AcadYear, Semester, ModuleCode, ModuleTitle, DayText, StartTime, Semester) %>% # select these columns
distinct() # remove duplicates
head(myModInfo) # peekmyBidmyBid.
myBid <- myBid %>%
filter(str_detect(ModuleCode, "^PL")) %>% # removing non-Psychology modules
filter(!str_detect(ModuleCode, "PLS|PLB")) %>% # remove PLS and PLB modules
filter(!str_detect(StudentAcctType, "Reserved")) %>% # remove reserved rounds
filter(!str_detect(StudentAcctType, "[G]")) %>% # remove bidding information from non-psychology students
filter(!str_detect(paste0(unique(myBid$ModuleCode[grep("2",myBid$Group)]), collapse = "|"), ModuleCode)) %>% # remove modules that have more than one lecture
select(-Faculty, -Group) # remove these columns
head(myBid) # peekmyModInfo and myBid.# transform these columns to numeric
for(r in c("Quota", "Bidders", "LowestBid", "LowestSuccessfulBid", "HighestBid", "StartTime"))
{
mydata[,grep(r, names(mydata))] <- as.numeric(mydata[,grep(r, names(mydata))])
}
# transform these columns to factors
for(r in c("AcadYear", "Semester", "ModuleCode", "Round", "StudentAcctType", "DayText", "StudentAcctType", "ModuleTitle"))
{
mydata[,grep(r, names(mydata))] <- factor(mydata[,grep(r, names(mydata))])
}DayText LevelsStudentAcctType Levels# create new variable that indicates the level of the module, based on their module code
mydata$Level <- factor(ifelse(str_detect(mydata$ModuleCode, "1[0-9][0-9][0-9]"), "Level 1",
ifelse(str_detect(mydata$ModuleCode, "2[0-9][0-9][0-9]"), "Level 2",
ifelse(str_detect(mydata$ModuleCode, "3[0-9][0-9][0-9]"), "Level 3",
ifelse(str_detect(mydata$ModuleCode, "4[0-9][0-9][0-9]"), "Level 4",
"Graduate Module")))))# create vector of the column names which are factors
facnames <- mydata %>% select_if(is.factor) %>% names()
# facnames without ModuleCode and StudentAcctType
facnames.mod <- facnames[-grep("ModuleCode|ModuleTitle", facnames)]
# create vector ofthe column names which are numeric
numnames <- mydata %>% select_if(is.numeric) %>% names()
# numnames without StartTime
numnames.time <- names(select_if(mydata, is.numeric))[-grep("StartTime", numnames)]Bidders is calculated across all academic years, all bidding rounds, all modules…## 'data.frame': 1621 obs. of 16 variables:
## $ AcadYear : Factor w/ 8 levels "2011/2012","2012/2013",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Semester : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ Round : Factor w/ 7 levels "1A","1B","1C",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ModuleCode : Factor w/ 76 levels "PL2131","PL2132",..: 1 1 2 2 3 3 4 4 5 5 ...
## $ Quota : num 5 12 35 35 28 50 25 22 25 30 ...
## $ Bidders : num 3 42 8 3 7 2 8 5 3 3 ...
## $ LowestBid : num 1 205 1 1 1 1 1 1 1 1 ...
## $ LowestSuccessfulBid: num 1 977 1 1 1 1 1 1 1 1 ...
## $ HighestBid : num 368 1255 500 250 1200 ...
## $ StudentAcctType : Factor w/ 4 levels "New[P]","NUS[P]",..: 3 1 3 1 3 1 3 1 3 1 ...
## $ ModuleTitle : Factor w/ 74 levels "Abnormal Psychology",..: 64 64 65 65 7 7 12 12 16 16 ...
## $ DayText : Factor w/ 5 levels "Monday","Tuesday",..: 3 3 2 2 2 2 3 3 1 1 ...
## $ StartTime : num 1600 1600 800 800 1200 1200 1400 1400 1400 1400 ...
## $ Level : Factor w/ 3 levels "Level 2","Level 3",..: 1 1 1 1 2 2 2 2 2 2 ...
## $ BidPerQuota : num 0.6 3.5 0.2286 0.0857 0.25 ...
## $ Period : Factor w/ 2 levels "Morning",">=Afternoon": 2 2 1 1 2 2 2 2 2 2 ...
# plot the categorical variables
for(r in facnames.mod)
{
plot(
ggplot(data = mydata, aes_string(x = r, fill = r)) +
geom_histogram(stat = "count") +
ylab("Count") +
ggtitle(paste0("Count of ", r)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_blank(),
legend.position = "none")
)
}# plot the continuous variables
for(r in numnames)
{
plot(
ggplot(data = mydata, aes_string(x = r, fill = r)) +
geom_histogram(fill = "violetred", alpha = 0.5, bins = 50) +
ylab("Histogram") +
ggtitle(paste0(r)) +
theme_classic() +
theme(axis.text.x = element_text(angle = 90),
axis.title.x = element_text())
)
}for(r in 1:length(facnames.mod)) # loop across all factors
{
for(i in 1:length(facnames.mod)) # inner loop
{
if(i == r | i < r)
{ # dont do anything if they are the same or the graph has been made before
} else {
tempform <- paste0("~ ", facnames.mod[r], " + ", facnames.mod[i]) # create formula for xtabs
# temp is a dataframe that is only going to exist in this section and overwritten with each loop
temp <- as.data.frame(xtabs(eval(parse(text = tempform)),
data = mydata,
subset = NULL))
plot(
ggplot(data = temp, aes_string(x = facnames.mod[r], y = facnames.mod[i], fill = "Freq", label = "Freq")) +
geom_tile() +
geom_text() +
scale_fill_gradient(low = "white", high = "violetred") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90),
legend.position = "none")
)
}
}
}for(r in 1:length(numnames)) # loop across all numeric columns
{
for(i in 1:length(numnames)) # inner loop
{
if(i == r | i < r)
{ # dont do anything if they are the same or the graph has been made before
} else {
# create formulas for lm()
tempform.std <- paste0("scale(", numnames[i],")", " ~ ", "scale(", numnames[r], ")") # standardized
tempform <- paste0(numnames[i], " ~ ", numnames[r]) # unstandardized
# regress to get best fit line
stdreg <- lm(eval(parse(text = tempform.std)),
data = mydata) # standardized
reg <- lm(eval(parse(text = tempform)),
data = mydata) # unstandardized
plot(
ggplot(data = mydata, aes_string(x = numnames[r], y = numnames[i])) +
geom_point(color = "violetred", size = 2, alpha = 0.3) +
theme_classic() +
geom_abline(slope = reg$coefficients[2], intercept = reg$coefficients[1], lty = "dashed") +
geom_label(aes(x = Inf, y = Inf, label = paste0("Standardized Regression Coefficient = ",
round(stdreg$coefficients[2],3)),
hjust = 1, vjust = 2)) +
theme(axis.text.x = element_text(angle = 90))
)
}
}
}for(r in facnames.mod) # loop across all factor columns
{
for(i in numnames) # inner loop across all numeric columns
{
plot(
ggplot(data = mydata, aes_string(x = r, y = i, fill = r)) +
geom_boxplot() +
theme_classic() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90))
)
}
}Is it true that it is easier to bid for a module in the morning?
Do bids become higher as the rounds get later?
What are the most and least popular modules?
ggplotly(
mydata %>%
filter(Level == "Level 4") %>%
filter(Round == "1A") %>%
group_by(ModuleCode, ModuleTitle) %>%
summarize(LSB.avg.sem.years = mean(LowestSuccessfulBid)) %>%
ungroup() %>%
mutate(ModuleCode = fct_reorder(ModuleCode, LSB.avg.sem.years)) %>%
ggplot(mapping = aes(x = ModuleCode, y = LSB.avg.sem.years, label = ModuleTitle, fill = LSB.avg.sem.years)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "none")
, tooltip = c("x", "label", "y")
, height = 600, width = 400
)ggplotly(
mydata %>%
filter(Level == "Level 4") %>%
filter(Round == "1A") %>%
group_by(ModuleCode, ModuleTitle) %>%
summarize(BPQ.avg.sem.years = mean(BidPerQuota)) %>%
ungroup() %>%
mutate(ModuleCode = fct_reorder(ModuleCode, BPQ.avg.sem.years)) %>%
ggplot(mapping = aes(x = ModuleCode, y = BPQ.avg.sem.years, label = ModuleTitle, fill = BPQ.avg.sem.years)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "none")
, tooltip = c("x", "label", "y")
, height = 600, width = 400
)ggplotly(
mydata %>%
filter(Level == "Level 4") %>%
filter(Round == "1A") %>%
group_by(ModuleCode, ModuleTitle) %>%
summarize(quota.avg.sem.years = mean(Quota)) %>%
ungroup() %>%
mutate(ModuleCode = fct_reorder(ModuleCode, quota.avg.sem.years)) %>%
ggplot(mapping = aes(x = ModuleCode, y = quota.avg.sem.years, label = ModuleTitle, fill = quota.avg.sem.years)) +
geom_bar(stat = "identity") +
coord_flip() +
theme_classic() +
theme(legend.position = "none")
, tooltip = c("x", "label", "y")
, height = 600, width = 400
)Lets look at each module and compare the average number of bidders, bidders per quota and lowest successful bids when the lecture begins in and after the morning.
for(r in c("meanBidders", "meanBpQ", "meanLSB"))
{
plot(mydata %>%
group_by(ModuleCode, ModuleTitle, Period) %>%
summarise(meanBidders = mean(Bidders), meanBpQ = mean(BidPerQuota), meanLSB = mean(LowestSuccessfulBid),
sdBidders = sd(Bidders), sdBpQ = sd(BidPerQuota), sdLSB = mean(LowestSuccessfulBid)) %>%
ggplot(aes_string(x = "Period", y = r, fill = "Period")) +
geom_bar(stat = "identity") +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", linetype = "blank"),
strip.text = element_text(color = "white", size = 9)) +
facet_wrap(~ ModuleCode:ModuleTitle, labeller = label_wrap_gen(width = 25)) +
ggtitle(r))
}for(r in c("meanBidders", "meanBpQ", "meanLSB"))
{
plot(mydata %>%
group_by(Level, Period) %>%
summarise(meanBidders = mean(Bidders), meanBpQ = mean(BidPerQuota), meanLSB = mean(LowestSuccessfulBid),
sdBidders = sd(Bidders), sdBpQ = sd(BidPerQuota), sdLSB = mean(LowestSuccessfulBid)) %>%
ggplot(aes_string(x = "Period", y = r, fill = "Period")) +
geom_bar(stat = "identity") +
theme_classic() +
theme(axis.text.x = element_blank(),
legend.position = "top",
strip.background = element_rect(fill = "grey30", color = "black"),
strip.text = element_text(color = "white", size = 12)) +
facet_wrap(~ Level) +
ggtitle(r))
}StartTimeHrs12 And Group-Mean Centered Hrs12.